{--
    Support for working with regular expressions.

    This exposes @java.util.regex.Pattern@ as 'Regex' and 
    @java.util.regex.MatchResult@ as 'MatchResult'.

    This module must have been imported during compilation or certain
    language features can not be used (see below).

    ## Frege language support

    ### Regex literals

    Regular expression literals have type 'Regex' and are written:

    > ´\b(foo|bar)\b´       -- string enclosed in acute accents
    > '\w+'                 -- string with length > 1 enclosed in apostrophes

    The notation with the apostrophes has been introduced because many
    have a hard time entering an acute accent mark on their terminal. However,
    it is not possible to write a regular expression of length 1 this way,
    because then the literal gets interpreted as a 'Char' literal.
    (One can write something like @\'(?:X)\'@ for a 'Regex' that matches a single @\'X\'@).
    
    The advantage of using literals rather than compiling string constants 
    to patterns at runtime is 

    1) they are checked by the Frege
    compiler, and illegal ones cause compilation to fail

    2) at runtime, literals are available as immutable values of type 
    @java.util.regex.Pattern@, not as strings, as can be seen so often in 
    human written Java code. Thus the cost of repeated regex compilation
    is avoided.

    3) The regular expression can be written directly, like in Perl or
    other languages, without taking String escape rules into account. That is,
    to match a $ sign, one writes

    > ´\$´ 

    and not 

    > "\\$"

    like in Java.

    The 'Regex' values created from the literals have the following flags enabled:
    'Regex.canon_eq', 'Regex.unicode_case' and 'Regex.unicode_character_class'.
    That is, they do their best - as far as Java supports it - to get processing
    of Unicode strings right.

    ### 'Regex' literals in pattern matching

    A 'Regex' literal is a valid pattern, like any literal:

    > case expr1 of
    >   ´\w+´  →  expr2

    This is equivalent to:

    > case expr1 of
    >   s | s ~ ´\w+´ →  expr2

    where _s_ is some fresh variable that doesn't appear in _expr2_.

    The match succeeds if the 'String' value resulting from evaluation of _expr1_
    matches the regular expression given. The match diverges when _expr1_ diverges.
    In all other cases, the match fails.

    Note that in this simple form, the value of _expr1_ is not available in _expr2_.
    One can use an "at" pattern for this:

    > case expr1 of
    >    string@'\w+'  →  expr2

    which gets desugared to

    > case expr1 of
    >   string | string ~ '\w+'  →  expr2

    leaving _string_ bound to the value of _expr1_.
    
    The following syntax not only checks if the regular expression matches but
    also binds the 'MatchResult':
    
    > case expr1 of
    >   m~'\w+'  →  expr2
    
    It is equivalent to
    
    > case expr1 of
    >   s | Just m ← s =~ '\w+'  →  expr2
    
    where _s_ is again a fresh variable not known in _expr2_, but 
    when the match succeeds _m_ is bound
    to the 'MatchResult' and can be inspected further in _expr2_.
    
    And finally, a pattern like
    
    > s@m~'\w+'
    
    works like the previous one but in addition binds _s_ to the string value 
    in question. 
-}

protected package frege.java.util.Regex where

infix  15 `=~` `=~~` `!~`  `~` `~~` `~~~` `~~*`


import frege.prelude.PreludeBase
import frege.control.Semigroupoid
import Java.Lang
import frege.prelude.PreludeIO
import frege.prelude.PreludeArrays
import frege.prelude.PreludeList


{--
    'Regex' values are based on Java's @java.util.regex.Pattern@ objects,
    which are compiled representations of regular expressions.
    
    We call them 'Regex' and not @Pattern@ in Frege, because the 
    word "pattern" and the phrase "pattern matching" have a quite
    different meaning in deconstruction of algebraic data types. 
    
    There are 2 ways to obtain a compiled regular expression:
    1. Use a 'Regex' literal, see the language reference section 2.7.3.
       All regular expression literals are values of type 'Regex'. 
       It is not possible to run a program which contains a syntactically
       invalid 'Regex' literal - 
       the compiler will flag it already at compile time.
       Hence, with 'Regex' literals one is immune against 'PatternSyntaxException's.
    2. Compile a String with 'regcomp' or 'regforce' at run-time.
       For an illegal regular expression, 'regcomp' returns the
       'PatternSyntaxException' in 'Left', whereas 'regforce' throws it,
       which, when not handled appropriately, may crash the program.
    
    'Regex' values are immutable.  
 -}
data Regex = native java.util.regex.Pattern where

    -- create a 'Matcher' from a regular expression and a 'String'
    -- pure native matcher :: Regex -> String -> Matcher

    {--
        Split a string around matches of a regular expression and return
        the result as list of strings.

        This is 'Regex.split' followed by a conversion of the resulting
        array to a list.

        Empty strings on the start of the string are preserved, but not those
        on the end of the string, e.g.:

        >  ´,´.splitted ",,a,b,c,," == ["", "", "a", "b", "c"]
    -}
    splitted rgx = JArray.toList . split rgx
    
    --- The @java.util.regex.Pattern.split@ method.
    --- As this returns a 'String' array, one usually uses 'Regex.splitted' instead.
    pure native split split  :: Regex -> String -> JArray String
    
    --- Return the source of this 'Regex', that is, the regular expression as a 'String'.
    pure native pattern :: Regex -> String
    
    --- Return the flags this pattern was compiled with, as a set of bits in an 'Int'.
    pure native flags :: Regex -> Int
    
    --- Enables UNIX lines mode.
    --- In this mode, only the @\n@ line terminator is recognized in the behavior of @.@, @^@, and @$@.
    --- Unix lines mode can also be enabled via the embedded flag expression @(?d)@.
    pure native unix_lines java.util.regex.Pattern.UNIX_LINES :: Int

    {--
        Enables case-insensitive matching.

        By default, case-insensitive matching assumes that only characters 
        in the US-ASCII charset are being matched. 
        Unicode-aware case-insensitive matching can be enabled by 
        specifying the 'Regex.unicode_case' flag in conjunction with this flag.

        Case-insensitive matching can also be enabled via 
        the embedded flag expression @(?i)@.

        Specifying this flag may impose a slight performance penalty.
    -}
    pure native case_insensitive java.util.regex.Pattern.CASE_INSENSITIVE :: Int 

    {--
        Permits whitespace and comments in pattern.

        In this mode, whitespace is ignored, and embedded comments 
        starting with @#@ are ignored until the end of a line.

        Comments mode can also be enabled via the embedded flag 
        expression @(?x)@.
    -}
    pure native comments java.util.regex.Pattern.COMMENTS :: Int

    {--
        Enables multiline mode.

        In multiline mode the expressions @^@ and @$@ match just 
        after or just before, respectively, 
        a line terminator or the end of the input sequence. 
        By default these expressions only match 
        at the beginning and the end of the entire input sequence.

        Multiline mode can also be enabled via the embedded flag 
        expression @(?m)@.
    -}
    pure native multiline java.util.regex.Pattern.MULTILINE :: Int

    {--
        Enables literal parsing of the pattern.

        When this flag is specified then the input string that 
        specifies the pattern is treated as a sequence of 
        literal characters. 
        Metacharacters or escape sequences in the input sequence 
        will be given no special meaning.

        The flags 'Regex.case_insensitive' and 'Regex.unicode_case' retain their 
        impact on matching when used in conjunction with this flag. 
        The other flags become superfluous.

        There is no embedded flag character for enabling literal parsing.
    -}
    pure native literal java.util.regex.Pattern.LITERAL :: Int
    
    {--
        Enables dotall mode.

        In dotall mode, the expression @.@ matches any character, 
        including a line terminator. 
        By default this expression does not match line terminators.

        Dotall mode can also be enabled via the embedded flag 
        expression @(?s)@. 
        (The @s@ is a mnemonic for "single-line" mode, 
        which is what this is called in Perl.)
    -}
    pure native dotall java.util.regex.Pattern.DOTALL :: Int

    {--
        Enables Unicode-aware case folding.

        When this flag is specified then case-insensitive matching, 
        when enabled by the 'Regex.case_insensitive' flag, 
        is done in a manner consistent with the Unicode Standard. 
        By default, case-insensitive matching assumes that only 
        characters in the US-ASCII charset are being matched.

        Unicode-aware case folding can also be enabled via the 
        embedded flag expression @(?u)@.

        Specifying this flag may impose a performance penalty.

        Because Frege 'Regex' literals will always be parsed with 
        the 'Regex.unicode_character_class' flag, this flag is also
        implicitly enabled by default.

        It can be disabled with the embedded flag expression @(?-u)@.
    -}
    pure native unicode_case java.util.regex.Pattern.UNICODE_CASE :: Int
    
    {--
        Enables canonical equivalence.

        When this flag is specified then two characters will be 
        considered to match if, and only if, 
        their full canonical decompositions match. 
        The expression @´a\u030A´@, for example, 
        will match the string @"\u00E5"@ when this flag is specified. 

        There is no embedded flag character for enabling canonical equivalence.

        Specifying this flag may impose a performance penalty.

        Frege 'Regex' literals will always be parsed with this flag.
    -}
    pure native canon_eq java.util.regex.Pattern.CANON_EQ :: Int

    {--
        Enables the Unicode version of Predefined character classes and POSIX character classes.

        When this flag is specified then the (US-ASCII only) 
        Predefined character classes and POSIX character classes are 
        in conformance with 
        'http://www.unicode.org/reports/tr18/ Unicode Technical Standard #18: Unicode Regular Expression' 
        Annex C: Compatibility Properties.

        The 'Regex.unicode_character_class' mode can also be enabled 
        via the embedded flag expression @(?U)@.

        The flag implies 'Regex.unicode_case', that is, it enables 
        Unicode-aware case folding.

        Specifying this flag may impose a performance penalty.

        Frege 'Regex' literals will always be parsed with this flag.
        It can be disabled with the embedded flag expression @(?-U)@.
    -}
    pure native  unicode_character_class  java.util.regex.Pattern.UNICODE_CHARACTER_CLASS :: Int

    --- Compile a 'String' to a 'Regex'.
    --- Returns either a 'PatternSyntaxException' in 'Left' or a 'Regex' in 'Right'.
    {-- 
        The overloaded form has an additional 'Int' argument which is a set of flags.
        Such a set can be constructed by adding up the individual flag values, for example:

        >  Regex.compile "foo|bar" (Regex.canon_eq + Regex.case_insensitive)

    -}
    pure native compile java.util.regex.Pattern.compile 
            :: String        -> (PatternSyntaxException|Regex)
            |  String -> Int -> (PatternSyntaxException|Regex)

    --- Synonym for 'String.quote', made available in the 'Regex' name space.
    --- NOTE(review): presumably this yields a pattern 'String' that matches the
    --- argument literally - confirm against the definition of 'String.quote'.
    quote = String.quote
    
    --- Returns the string representation of this pattern. 
    --- This is the regular expression from which this pattern was 'Regex.compile'd.
    pure native toString :: Regex -> String    

--- The exception that signals an illegal regular expression when a 'String'
--- is compiled to a 'Regex' at run-time.
--- 'regforce' throws it, while 'regcomp' hands it back in 'Left'.
protected data PatternSyntaxException = native java.util.regex.PatternSyntaxException
derive Exceptional PatternSyntaxException

--- Compile a 'String' to a 'Regex' at run-time; a synonym for 'Regex.compile'.
--- An illegal regular expression is reported as 'PatternSyntaxException' in 'Left',
--- a successfully compiled 'Regex' is delivered in 'Right'.
regcomp = Regex.compile

{-- 
    Compile a 'String' to a 'Regex', without any protection against illegal input.
    
    This binds to the same Java method as 'Regex.compile', but the result is
    a plain 'Regex', hence a 'PatternSyntaxException' will be thrown
    when the regular expression is illegal. 

    The second form takes an additional 'Int' argument, 
    which is a set of flags like in 'Regex.compile'.

    Use this only if you're sure that pattern compilation will not throw
    an exception, or if you don't care.
-}
pure native regforce java.util.regex.Pattern.compile 
        :: String        -> Regex
        |  String -> Int -> Regex 



--- This class is only used to inject the replace functions into 'String'.
protected class Replace s where
    {--
        Replaces the first subsequence of the input sequence that matches
        the pattern with the given replacement string.
 
        The arguments are, in this order: the input, the 'Regex' and the replacement.

        This method scans the input sequence from the start
        looking for a match of the pattern.
        Characters that are not part of the match are appended directly
        to the result string; the match is replaced in the result by the
        replacement string. The replacement string may contain references
        to captured subsequences.
 
        Note that backslashes (\\) and dollar signs ($) in the replacement
        string may cause the results to be different than if it were
        being treated as a literal replacement string.
        Dollar signs may be treated as references to captured subsequences, 
        and backslashes are used to escape
        literal characters in the replacement string.
 
        For example
        
        > replaceFirst "zzzdogzzzdogzzz" ´dog´ "cat" == "zzzcatzzzdogzzz"
    -}
    protected replaceFirst  :: s -> Regex -> s -> s
    
    {--
        Like 'replaceFirst', but replaces *all* substrings that match the pattern.
    -}
    protected replaceAll    :: s -> Regex -> s -> s

instance Replace String where
    {--
        Replaces the first subsequence of the input sequence that matches
        the pattern with the given replacement string.
 
        The arguments are, in this order: the input, the 'Regex' and the replacement.

        This method scans the input sequence from the start
        looking for a match of the pattern.
        Characters that are not part of the match are appended directly
        to the result string; the match is replaced in the result by the
        replacement string. The replacement string may contain references
        to captured subsequences.
 
        Note that backslashes (\\) and dollar signs ($) in the replacement
        string may cause the results to be different than if it were
        being treated as a literal replacement string.
        Dollar signs may be treated as references to captured subsequences,
        and backslashes are used to escape
        literal characters in the replacement string.
 
        For example
        
        > "zzzdogzzzdogzzz".replaceFirst  ´dog´ "cat" == "zzzcatzzzdogzzz"
    -}
    pure native replaceFirst frege.runtime.Regex9.replaceFirst :: String -> Regex -> String -> String
    
    {--
        Like 'String.replaceFirst', but replaces *all* substrings that match the pattern.
    -}
    pure native replaceAll   frege.runtime.Regex9.replaceAll   :: String -> Regex -> String -> String

--- > substituteFirst input regex replacement
--- Top level synonym for 'String.replaceFirst': replaces the first
--- match of _regex_ in _input_ with _replacement_.
substituteFirst input regex replacement = String.replaceFirst input regex replacement
--- > substituteAll input regex replacement
--- Top level synonym for 'String.replaceAll': replaces all
--- matches of _regex_ in _input_ with _replacement_.
substituteAll input regex replacement = String.replaceAll input regex replacement

{--
    This interface contains query methods used to determine the results 
    of a match against a regular expression.

    More: 'https://docs.oracle.com/javase/7/docs/api/java/util/regex/MatchResult.html JavaDoc'
-}
data MatchResult = pure native java.util.regex.MatchResult where
    {--
        The number of capturing groups in this matcher's pattern.

        By convention, group zero stands for the entire pattern 
        and is not counted here.

        Every non-negative integer that is not greater than the value 
        returned by this method is guaranteed to be a 
        valid group index for this matcher. 
    -}
    pure native groupCount :: MatchResult -> Int
    
    {--
        > group m n
        gives the input subsequence that was captured by the group with
        index _n_ during the previous match operation.

        Capturing groups are numbered from left to right, beginning with one.
        Group zero stands for the entire pattern, hence @(m.group 0)@
        is the portion of the input string that was matched by the pattern.

        'Nothing' is returned if the match was successful, but the 
        specified group failed to match any part of the input sequence.
        
        Some groups, for example @(a?)@, match the empty string.
        For such a group that successfully matched the empty string
        in the input, the result is @Just ""@.

        'Nothing' is also the result for a group that does not exist, 
        that is, when the 'Int' argument is outside
        the range @[0..@'MatchResult.groupCount' @m]@. This is because the
        'IndexOutOfBoundsException' reported by 'MatchResult.groupPrim'
        is mapped to 'Nothing'.   

        For a 'MatchResult' _m_ with input
        sequence _s_ and valid group index _g_ the following property holds:
        > isJust (m.group g) ==> (m.group g) == Just (s.substr (m.start g) (m.end g))
    -}
    group m n = either (const Nothing) id (groupPrim m n)

    --- The primitive behind 'MatchResult.group', it exposes an invalid
    --- group index as 'IndexOutOfBoundsException' in 'Left'.
    pure  native groupPrim  group  :: MatchResult -> Int -> (IndexOutOfBoundsException|Maybe String)
    
    {--
        All subsequences captured during the previous match operation, 
        starting with group 0, or the empty list if there was no match.

        Intended usage:

        > groups $ s =~ regex

        See also '=~~'
    -}
    groups Nothing  = []
    groups (Just m) = map (group m) [0..groupCount m]
 
    {--
        @MatchResult.match m@ is the input subsequence matched by the previous match.
        The result is 'undefined' if the last match was not successful.

        The following holds for a 'MatchResult' @m@ with input sequence @s@:
        > isJust (m.group 0) ==> unJust (m.group 0) == m.match

        Some patterns, for example @a?@, match the empty string.
        When such a pattern successfully matches the empty string in the input,
        the result is the empty string.
    -}
    pure  native match group          :: MatchResult -> String

    {--
        The start index of the subsequence captured by the
        given group during the previous match operation, where group 0 
        stands for the entire pattern.

        The result is -1 if the specified capturing group failed to match.

        This property holds:
        > (m.group n == Nothing) ==> (m.start n < 0)
    -}
    pure  native start                :: MatchResult -> Int -> Int

    {--
        The offset after the last character of the subsequence
        captured by the
        given group during the previous match operation, where group 0 
        stands for the entire pattern.

        The result is -1 if the specified capturing group failed to match.

        This property holds:
        > (m.group n == Nothing) ==> (m.end n < 0)
    -}
    pure  native end                  :: MatchResult -> Int -> Int

-- ### several regex and matcher functions #####

{--
    > string =~ regex
    tries to match _string_ against _regex_ and returns
    @Just result@ if it succeeds, where _result_ is the 'MatchResult' 
    of the match, and @Nothing@ otherwise.
 -}
pure native =~ frege.runtime.Regex9.findResult :: String -> Regex -> Maybe MatchResult 
-- (s) =~ (p::Regex) = (p.matcher s).find

{--
    > findAt string regex offset
    
    Like '=~', but matches only the substring of _string_ that starts at _offset_
    against _regex_.
    
    If _offset_ is negative or not smaller than the length of _string_, the 
    result is 'Nothing'.

    This is the 3-argument form of the same runtime method that implements '=~'.
-}
pure native findAt frege.runtime.Regex9.findResult :: String -> Regex -> Int -> Maybe MatchResult

{--
    > string =~~ regex
    matches _string_ against _regex_ with '=~' and passes the outcome to
    'MatchResult.groups', so that the result is the 
    list of all captured substrings, or the empty list if there was no match.
    
    For example:
    
    > case "today is 2014-02-18" =~~ ´(\d\d\d\d)-(0[1-9]|1[012])-(\d\d)´ of
    >    [Just date, Just year, Just month, Just day] -> ...
    >    baddate -> ....
 -}
(str) =~~ (rx::Regex) = MatchResult.groups found
    where found = str =~ rx

{--
    > string ~ regex
    is @true@ if _string_ matches _regex_, @false@ otherwise.

    Unlike '=~', this only tells whether there is a match and delivers
    no 'MatchResult'.
 -}
pure native ~ frege.runtime.Regex9.find :: String -> Regex -> Bool

{--
    The negation of '~':
    > s !~ p == !(s ~ p)
    That is, the result is @true@ if and only if _s_ does not match _p_.
-}
str !~ rx = if str ~ rx then false else true

{--
    > ("string" ~~ ´r..´) == Just "rin"
    Attempts a match with '=~' and, if there is one, returns group 0 of it,
    that is @Just x@ where _x_ is the matched substring. 
    The result is @Nothing@ if there was no match.
-}
str ~~ rx = maybe Nothing wholeMatch (str =~ rx)
    where wholeMatch m = MatchResult.group m 0


{--
    > string ~~~ regex
    Matches _string_ with _regex_ and returns a function
    that maps a group number to the corresponding captured substring, 
    where group 0 is the matched part of the string. 
    If there was no match, the function returned maps every number to 'Nothing'.
    >   let f = "frege" ~~~ ´(..).(..)´
    >   in [ f i | i <- [0..3] ]
    yields
    > [Just "frege", Just "fr", Just "ge", Nothing]
-}
str ~~~ rx = maybe (const Nothing) MatchResult.group (str =~ rx)

{--
    > "cats and dogs are not concatenated." ~~* ´cat|dog´
    Matches the string with the regex and returns a list of all matches.
    The example gives:
    > ["cat", "dog", "cat"]

    The search for the next match resumes where the previous match ended.
    However, a regex like @´x*´@ can match the empty string, in which case 
    the end of the match is the same as its start. 
    To make sure that the scan always makes progress (and the result list is 
    finite), the search is then resumed one position after the start of the 
    empty match. This is the same policy that @java.util.regex.Matcher.find()@
    applies for successive matches.

    Because 'findAt' gives 'Nothing' for an offset that is not smaller than
    the length of the string, no match is reported at the very end of the 
    string, and the result for an empty string is always the empty list.
-}
s ~~* r = go 0 where
    -- 'go' relies on 'MatchResult.start' and 'MatchResult.end' being 
    -- offsets into the whole string _s_, not into the part searched by 'findAt'.
    go n = case findAt s r n of
        Nothing -> []
        -- For a non-empty match, @m.end 0 >= m.start 0 + 1@, hence 'max' 
        -- picks the end of the match. For an empty match it picks the
        -- position after the match, which prevents finding it again forever.
        Just m  -> m.match : go (max (m.end 0) (m.start 0 + 1))

